dependent_var <- "MEDHVAL"
predictors <- c("PCTBACHMOR", "NBELPOV100", "PCTVACANT", "PCTSINGLES")
summary_stats <- data %>%
select(all_of(c(dependent_var, predictors))) %>%
summarise_all(list(Mean = mean, SD = sd), na.rm = TRUE) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value") %>%
separate(Variable, into = c("Variable", "Stat"), sep = "_") %>%
pivot_wider(names_from = Stat, values_from = Value)
summary_stats$Variable <- recode(summary_stats$Variable,
"MEDHVAL" = "Median House Value",
"NBELPOV100" = "# Households Living in Poverty",
"PCTBACHMOR" = "% of Individuals with Bachelor’s Degrees or Higher",
"PCTVACANT" = "% of Vacant Houses",
"PCTSINGLES" = "% of Single House Units"
)
summary_stats <- summary_stats %>%
mutate(
Mean = round(Mean, 2),
SD = round(SD, 2)
)
summary_stats <- summary_stats %>%
arrange(Variable == "Median House Value")
predictor_rows <- which(summary_stats$Variable != "Median House Value")
dependent_rows <- which(summary_stats$Variable == "Median House Value")
# Determine the start and end rows for each group
start_pred <- min(predictor_rows)
end_pred <- max(predictor_rows)
start_dep <- min(dependent_rows)
end_dep <- max(dependent_rows)
# Create the table using kable and add extra formatting
kable(summary_stats, caption = "Summary Statistics",
align = c("l", "l", "l"), booktabs = TRUE, escape = FALSE ) %>%
add_header_above(c(" " = 1, "Statistics" = 2)) %>%
kable_styling(full_width = FALSE) %>%
group_rows("Predictors", start_pred, end_pred) %>%
group_rows("Dependent Variable", start_dep, end_dep)%>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = TRUE)
Summary Statistics
|
|
Statistics
|
|
Variable
|
Mean
|
SD
|
|
Predictors
|
|
% of Individuals with Bachelor’s Degrees or Higher
|
16.08
|
17.77
|
|
# Households Living in Poverty
|
189.77
|
164.32
|
|
% of Vacant Houses
|
11.29
|
9.63
|
|
% of Single House Units
|
9.23
|
13.25
|
|
Dependent Variable
|
|
Median House Value
|
66287.73
|
60006.08
|
#check 0
columns_to_check <- c(dependent_var, predictors)
zero_counts <- sapply(data[columns_to_check], function(x) sum(x == 0, na.rm = TRUE))
zero_counts[zero_counts > 0]
## PCTBACHMOR NBELPOV100 PCTVACANT PCTSINGLES
## 143 33 163 306
data <- data %>%
mutate(
LNMEDHVAL = log(MEDHVAL),
LNPCTBACHMOR = log(1+PCTBACHMOR),
LNNBELPOV100 = log(1+NBELPOV100),
LNPCTVACANT = log(1+PCTVACANT),
LNPCTSINGLES = log(1+PCTSINGLES)
)
longer_version<- data %>%
pivot_longer(cols = c("MEDHVAL", "PCTBACHMOR", "NBELPOV100", "PCTVACANT", "PCTSINGLES"),
names_to = "Variable",
values_to = "Value")
ggplot(longer_version,aes(x = Value)) +
geom_histogram(aes(y = ..count..), fill = "black", alpha = 0.7) +
facet_wrap(~Variable, scales = "free", ncol = 3, labeller = as_labeller(c(
"MEDHVAL" = "Median House Value",
"PCTBACHMOR" = "% with Bachelor’s Degrees or Higher",
"NBELPOV100" = "# Households Living in Poverty",
"PCTVACANT" = "% of Vacant Houses",
"PCTSINGLES" = "% of Single House Units"
))) +
labs(x = "Value", y = "Count", title = "Histograms of Dependent and Predictor Variables") +
theme_light() +
theme(plot.subtitle = element_text(size = 9,face = "italic"),
plot.title = element_text(size = 12, face = "bold"),
axis.text.x=element_text(size=6),
axis.text.y=element_text(size=6),
axis.title=element_text(size=8))

# histograms of the transformed variables
longer_version2 <- data %>%
pivot_longer(cols = c(LNMEDHVAL, LNPCTBACHMOR ,LNNBELPOV100,LNPCTVACANT, LNPCTSINGLES),
names_to = "Variable",
values_to = "Value")
ggplot(longer_version2,aes(x = Value)) +
geom_histogram(aes(y = ..count..), fill = "red", alpha = 0.7) +
facet_wrap(~Variable, scales = "free", ncol = 3, labeller = as_labeller(c(
"LNMEDHVAL" = "Log Median House Value",
"LNPCTBACHMOR" = "Log % with Bachelor’s Degree",
"LNNBELPOV100" = "Log # Households in Poverty",
"LNPCTVACANT" = "Log % Vacant Houses",
"LNPCTSINGLES" = "Log % Single House Units"
))) +
labs(x = "Value", y = "Count", title = "Histograms of Dependent and log transformed Predictor Variables") +
theme_light() +
theme(plot.subtitle = element_text(size = 9,face = "italic"),
plot.title = element_text(size = 12, face = "bold"),
axis.text.x=element_text(size=6),
axis.text.y=element_text(size=6),
axis.title=element_text(size=8))

ggplot(shape) +
geom_sf(aes(fill = LNMEDHVAL), color = "transparent") +
scale_fill_gradientn(colors = c("#fff0f3", "#a4133c"),
name = "LNMEDHVAL",
na.value = "transparent") +
theme(legend.text = element_text(size = 9),
legend.title = element_text(size = 10),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
plot.subtitle = element_text(size = 9, face = "italic"),
plot.title = element_text(size = 12, face = "bold"),
panel.background = element_blank(),
panel.border = element_rect(colour = "grey", fill = NA, size = 0.8)) +
labs(title = "Log Transformed Median House Value")

shpe_longer<- shape %>%
pivot_longer(cols = c("PCTVACANT", "PCTSINGLES", "PCTBACHMOR", "LNNBELPOV"),
names_to = "Variable",
values_to = "Value")
custom_titles <- c(
PCTVACANT = "Percent of Vacant Houses",
PCTSINGLES = "Percent of Single House Units",
PCTBACHMOR = "Percent of Bachelor's Degree or Higher",
LNNBELPOV = "Logged Transformed Poverty Rate"
)
plot_list <- lapply(unique(shpe_longer$Variable), function(var_name) {
data_subset <- subset(shpe_longer, Variable == var_name)
ggplot(data_subset) +
geom_sf(aes(fill = Value), color = "transparent") +
scale_fill_gradientn(
colors = c("#fff0f3", "#a4133c"),
name = var_name,
na.value = "transparent"
) +
labs(title = custom_titles[[var_name]]) +
theme(
legend.text = element_text(size = 8),
legend.title = element_text(size = 10),
legend.key.size = unit(0.3, "cm"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
plot.subtitle = element_text(size = 9, face = "italic"),
plot.title = element_text(size = 15, face = "bold"),
panel.background = element_blank(),
panel.border = element_rect(colour = "grey", fill = NA, size = 0.8)
)
})
# Combine the plots into a grid (2 columns by 2 rows)
combined_plot <- (plot_list[[1]] + plot_list[[2]]) /
(plot_list[[3]] + plot_list[[4]])
combined_plot

fit <- lm(LNMEDHVAL ~ PCTVACANT + PCTSINGLES + PCTBACHMOR + LNNBELPOV100, data=data)
summary(fit)
##
## Call:
## lm(formula = LNMEDHVAL ~ PCTVACANT + PCTSINGLES + PCTBACHMOR +
## LNNBELPOV100, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.25825 -0.20391 0.03822 0.21744 2.24347
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.1137661 0.0465330 238.836 < 2e-16 ***
## PCTVACANT -0.0191569 0.0009779 -19.590 < 2e-16 ***
## PCTSINGLES 0.0029769 0.0007032 4.234 2.42e-05 ***
## PCTBACHMOR 0.0209098 0.0005432 38.494 < 2e-16 ***
## LNNBELPOV100 -0.0789054 0.0084569 -9.330 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3665 on 1715 degrees of freedom
## Multiple R-squared: 0.6623, Adjusted R-squared: 0.6615
## F-statistic: 840.9 on 4 and 1715 DF, p-value: < 2.2e-16
anova_table <- anova(fit)
anova_table
## Analysis of Variance Table
##
## Response: LNMEDHVAL
## Df Sum Sq Mean Sq F value Pr(>F)
## PCTVACANT 1 180.392 180.392 1343.087 < 2.2e-16 ***
## PCTSINGLES 1 24.543 24.543 182.734 < 2.2e-16 ***
## PCTBACHMOR 1 235.118 235.118 1750.551 < 2.2e-16 ***
## LNNBELPOV100 1 11.692 11.692 87.054 < 2.2e-16 ***
## Residuals 1715 230.344 0.134
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fitted_values <- fitted(fit)
residuals_values <- residuals(fit)
standardized_residuals <- rstandard(fit)
data <- data %>%
mutate(
Fitted = fitted_values,
Residuals = residuals_values,
Standardized_Residuals = standardized_residuals)
ggplot(data, aes(x = Fitted, y = Standardized_Residuals)) +
geom_point(color = "black") +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Scatter Plot of Standardized Residuals vs Fitted Values",
x = "Predicted Values",
y = "Standardized Residuals"
) +
theme_minimal() +
theme(plot.subtitle = element_text(size = 9,face = "italic"),
plot.title = element_text(size = 12, face = "bold"),
axis.text.x=element_text(size=6),
axis.text.y=element_text(size=6),
axis.title=element_text(size=8))

ggplot(data, aes(x = Standardized_Residuals)) +
geom_histogram(bins = 30, fill = "black") +
labs(title = "Histogram of Standardized Residuals",
x = "Standardized Residuals",
y = "Frequency") +
theme_minimal() +
theme(plot.subtitle = element_text(size = 9,face = "italic"),
plot.title = element_text(size = 12, face = "bold"),
axis.text.x=element_text(size=6),
axis.text.y=element_text(size=6),
axis.title=element_text(size=8))

longer<-data %>%
pivot_longer(cols = c("PCTBACHMOR", "LNNBELPOV100", "PCTVACANT", "PCTSINGLES"),
names_to = "Variable",
values_to = "Value")
ggplot(longer,aes(x = Value, y = LNMEDHVAL)) +
geom_point(color = "black") +
geom_smooth(method = "lm", color = "red", se = FALSE) +
facet_wrap(~ Variable, scales = "free", labeller = as_labeller(c(
"PCTBACHMOR" = "% with Bachelor’s Degrees or Higher",
"LNNBELPOV100" = "Logged Households Living in Poverty",
"PCTVACANT" = "% of Vacant Houses",
"PCTSINGLES" = "% of Single House Units"
))) +
theme_light() +
theme(plot.subtitle = element_text(size = 9,face = "italic"),
plot.title = element_text(size = 12, face = "bold"),
axis.text.x=element_text(size=6),
axis.text.y=element_text(size=6),
axis.title=element_text(size=8)) +
labs(title = "Scatter Plots of Dependent Variable vs. Predictors",
x = "Predictor Value",
y = "Log of Median House Value")

custom_labels <- c(
"% of Individuals with Bachelor’s Degrees or Higher" = "PCTBACHMOR",
"% of Vacant Houses" = "PCTVACANT",
"% of Single House Units" = "PCTSINGLES",
"# Households Living in Poverty" = "LNNBELPOV100"
)
predictor_vars <- data[, c("PCTVACANT", "PCTSINGLES", "PCTBACHMOR", "LNNBELPOV100")]
cor_matrix <- cor(predictor_vars, use = "complete.obs", method = "pearson")
rownames(cor_matrix) <- names(custom_labels)
colnames(cor_matrix) <- names(custom_labels)
ggcorrplot(cor_matrix,
method = "square",
type = "lower",
lab = TRUE,
lab_size = 3,
colors = c("#d73027", "white", "#1a9850"))+
labs(title = "Correlation Matrix for all Predictor Variables") +
theme(plot.subtitle = element_text(size = 9, face = "italic"),
plot.title = element_text(size = 12, face = "bold"),
axis.text.x = element_text(size = 7),
axis.text.y = element_text(size = 7),
axis.title = element_text(size = 8))

---
title: 'Using OLS Regression to Predict Median House Values in Philadelphia'
author: "Zhanchao Yang, Haoyu Zhu, Kavana Raju"
date: "`r Sys.Date()`"
output: 
  html_document:
    theme: united
    highlight: tango
    toc: true
    toc_float: true
    code_folding: hide
    code_download: yes
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
library(sf)
library(tidycensus)
library(knitr) 
library(gt) 
library(ggplot2)
library(dplyr)
library(tidyr)
library(kableExtra)
library(gridExtra)
library(ggcorrplot)
library(patchwork)
```

```{r, warning=FALSE, message=FALSE, include= FALSE}
# Load the data
data <- read.csv("data/RegressionData.csv")
shape <- st_read("data/RegressionData.shp")
```


```{r summary stats, warning=FALSE, message=FALSE}
dependent_var <- "MEDHVAL"

predictors <- c("PCTBACHMOR", "NBELPOV100", "PCTVACANT", "PCTSINGLES")

summary_stats <- data %>%
  select(all_of(c(dependent_var, predictors))) %>%
  summarise_all(list(Mean = mean, SD = sd), na.rm = TRUE) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value") %>%
  separate(Variable, into = c("Variable", "Stat"), sep = "_") %>%
  pivot_wider(names_from = Stat, values_from = Value)



summary_stats$Variable <- recode(summary_stats$Variable,
  "MEDHVAL" = "Median House Value",
  "NBELPOV100" = "# Households Living in Poverty",
  "PCTBACHMOR" = "% of Individuals with Bachelor’s Degrees or Higher",
  "PCTVACANT" = "% of Vacant Houses",
  "PCTSINGLES" = "% of Single House Units"
)



summary_stats <- summary_stats %>%
  mutate(
    Mean = round(Mean, 2),
    SD = round(SD, 2)
  )

summary_stats <- summary_stats %>%
  arrange(Variable == "Median House Value")

predictor_rows <- which(summary_stats$Variable != "Median House Value")
dependent_rows <- which(summary_stats$Variable == "Median House Value")

# Determine the start and end rows for each group
start_pred <- min(predictor_rows)
end_pred   <- max(predictor_rows)
start_dep  <- min(dependent_rows)
end_dep    <- max(dependent_rows)

# Create the table using kable and add extra formatting
kable(summary_stats, caption = "Summary Statistics", 
      align = c("l", "l", "l"), booktabs = TRUE, escape = FALSE ) %>%
  add_header_above(c(" " = 1, "Statistics" = 2)) %>%
  kable_styling(full_width = FALSE) %>%
  group_rows("Predictors", start_pred, end_pred) %>%
  group_rows("Dependent Variable", start_dep, end_dep)%>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = TRUE)

```



```{r}
#check 0
columns_to_check <- c(dependent_var, predictors)

zero_counts <- sapply(data[columns_to_check], function(x) sum(x == 0, na.rm = TRUE))

zero_counts[zero_counts > 0]

```

```{r}
data <- data %>%
  mutate(
    LNMEDHVAL = log(MEDHVAL),
    LNPCTBACHMOR = log(1+PCTBACHMOR),
    LNNBELPOV100 = log(1+NBELPOV100),
    LNPCTVACANT = log(1+PCTVACANT),
    LNPCTSINGLES = log(1+PCTSINGLES)
  )
```

```{r, fig.height=7, fig.width=9, warning=FALSE, message=FALSE}
longer_version<- data %>%
  pivot_longer(cols = c("MEDHVAL", "PCTBACHMOR", "NBELPOV100", "PCTVACANT", "PCTSINGLES"),
               names_to = "Variable",
               values_to = "Value")

ggplot(longer_version,aes(x = Value)) +
  geom_histogram(aes(y = ..count..), fill = "black", alpha = 0.7) +  
  facet_wrap(~Variable, scales = "free", ncol = 3, labeller = as_labeller(c(
    "MEDHVAL" = "Median House Value",
    "PCTBACHMOR" = "% with Bachelor’s Degrees or Higher",
    "NBELPOV100" = "# Households Living in Poverty",
    "PCTVACANT" = "% of Vacant Houses",
    "PCTSINGLES" = "% of Single House Units"
  ))) +  
  labs(x = "Value", y = "Count", title = "Histograms of Dependent and Predictor Variables") +
  theme_light() +   
  theme(plot.subtitle = element_text(size = 9,face = "italic"),
        plot.title = element_text(size = 12, face = "bold"), 
        axis.text.x=element_text(size=6),
        axis.text.y=element_text(size=6), 
        axis.title=element_text(size=8))
```


```{r, fig.height=7, fig.width=9, warning=FALSE, message=FALSE}
# histograms of the transformed variables
longer_version2 <- data %>%
  pivot_longer(cols = c(LNMEDHVAL, LNPCTBACHMOR ,LNNBELPOV100,LNPCTVACANT, LNPCTSINGLES),
               names_to = "Variable",
               values_to = "Value")

ggplot(longer_version2,aes(x = Value)) +
  geom_histogram(aes(y = ..count..), fill = "red", alpha = 0.7) +  
  facet_wrap(~Variable, scales = "free", ncol = 3, labeller = as_labeller(c(
    "LNMEDHVAL" = "Log Median House Value",
    "LNPCTBACHMOR" = "Log % with Bachelor’s Degree",
    "LNNBELPOV100" = "Log # Households in Poverty",
    "LNPCTVACANT" = "Log % Vacant Houses",
    "LNPCTSINGLES" = "Log % Single House Units"
  ))) +  
  labs(x = "Value", y = "Count", title = "Histograms of Dependent and log transformed Predictor Variables") +
  theme_light() +   
  theme(plot.subtitle = element_text(size = 9,face = "italic"),
        plot.title = element_text(size = 12, face = "bold"), 
        axis.text.x=element_text(size=6),
        axis.text.y=element_text(size=6), 
        axis.title=element_text(size=8))
```




```{r,fig.height=7, fig.width=9,warning=FALSE, message=FALSE}
ggplot(shape) +
  geom_sf(aes(fill = LNMEDHVAL), color = "transparent") +
  scale_fill_gradientn(colors = c("#fff0f3", "#a4133c"), 
                       name = "LNMEDHVAL", 
                       na.value = "transparent") + 
  theme(legend.text = element_text(size = 9),
        legend.title = element_text(size = 10),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        plot.subtitle = element_text(size = 9, face = "italic"),
        plot.title = element_text(size = 12, face = "bold"),
        panel.background = element_blank(),
        panel.border = element_rect(colour = "grey", fill = NA, size = 0.8)) +
  labs(title = "Log Transformed Median House Value")
```


```{r, fig.height=12, fig.width=15, warning=FALSE, message=FALSE}
shpe_longer<- shape %>%
  pivot_longer(cols = c("PCTVACANT", "PCTSINGLES", "PCTBACHMOR", "LNNBELPOV"),
               names_to = "Variable",
               values_to = "Value")
custom_titles <- c(
  PCTVACANT   = "Percent of Vacant Houses",
  PCTSINGLES  = "Percent of Single House Units",
  PCTBACHMOR  = "Percent of Bachelor's Degree or Higher",
  LNNBELPOV   = "Logged Transformed Poverty Rate"
)



plot_list <- lapply(unique(shpe_longer$Variable), function(var_name) {
  data_subset <- subset(shpe_longer, Variable == var_name)
  
  ggplot(data_subset) +
    geom_sf(aes(fill = Value), color = "transparent") +
    scale_fill_gradientn(
      colors = c("#fff0f3", "#a4133c"),
      name = var_name,
      na.value = "transparent"
    ) +
    labs(title = custom_titles[[var_name]]) +
    theme(
      legend.text = element_text(size = 8),
      legend.title = element_text(size = 10),
      legend.key.size = unit(0.3, "cm"),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank(),
      plot.subtitle = element_text(size = 9, face = "italic"),
      plot.title = element_text(size = 15, face = "bold"),
      panel.background = element_blank(),
      panel.border = element_rect(colour = "grey", fill = NA, size = 0.8)
    )
})

# Combine the plots into a grid (2 columns by 2 rows)
combined_plot <- (plot_list[[1]] + plot_list[[2]]) /
                 (plot_list[[3]] + plot_list[[4]])

combined_plot
```


```{r regression}
fit <- lm(LNMEDHVAL ~ PCTVACANT + PCTSINGLES + PCTBACHMOR + LNNBELPOV100, data=data)
summary(fit)
```
```{r}
anova_table <- anova(fit)
anova_table
```
```{r}
fitted_values <- fitted(fit)
residuals_values <- residuals(fit)
standardized_residuals <- rstandard(fit)

data <- data %>%
  mutate(
    Fitted = fitted_values,
    Residuals = residuals_values,
    Standardized_Residuals = standardized_residuals)
```

```{r}
ggplot(data, aes(x = Fitted, y = Standardized_Residuals)) +
  geom_point(color = "black") +    
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +  
  labs(
    title = "Scatter Plot of Standardized Residuals vs Fitted Values",
    x = "Predicted Values",
    y = "Standardized Residuals"
  ) +
  theme_minimal() +   
  theme(plot.subtitle = element_text(size = 9,face = "italic"),
        plot.title = element_text(size = 12, face = "bold"), 
        axis.text.x=element_text(size=6),
        axis.text.y=element_text(size=6), 
        axis.title=element_text(size=8))
```

```{r}
ggplot(data, aes(x = Standardized_Residuals)) +
  geom_histogram(bins = 30, fill = "black") +
  labs(title = "Histogram of Standardized Residuals", 
       x = "Standardized Residuals", 
       y = "Frequency") +
  theme_minimal() +   
  theme(plot.subtitle = element_text(size = 9,face = "italic"),
        plot.title = element_text(size = 12, face = "bold"), 
        axis.text.x=element_text(size=6),
        axis.text.y=element_text(size=6), 
        axis.title=element_text(size=8))
```








```{r fig.height=7, fig.width=9, warning=FALSE, message=FALSE}
longer<-data %>%
  pivot_longer(cols = c("PCTBACHMOR", "LNNBELPOV100", "PCTVACANT", "PCTSINGLES"),
               names_to = "Variable",
               values_to = "Value")

ggplot(longer,aes(x = Value, y = LNMEDHVAL)) +
  geom_point(color = "black") +
  geom_smooth(method = "lm", color = "red", se = FALSE) + 
  facet_wrap(~ Variable, scales = "free", labeller = as_labeller(c(
    "PCTBACHMOR" = "% with Bachelor’s Degrees or Higher",
    "LNNBELPOV100" = "Logged Households Living in Poverty",
    "PCTVACANT" = "% of Vacant Houses",
    "PCTSINGLES" = "% of Single House Units"
  )))  +
  theme_light() +   
  theme(plot.subtitle = element_text(size = 9,face = "italic"),
        plot.title = element_text(size = 12, face = "bold"), 
        axis.text.x=element_text(size=6),
        axis.text.y=element_text(size=6), 
        axis.title=element_text(size=8)) +
  labs(title = "Scatter Plots of Dependent Variable vs. Predictors", 
       x = "Predictor Value", 
       y = "Log of Median House Value")
```






```{r, warning=FALSE, message=FALSE}

custom_labels <- c(
  "% of Individuals with Bachelor’s Degrees or Higher" = "PCTBACHMOR",
  "% of Vacant Houses" = "PCTVACANT",
  "% of Single House Units" = "PCTSINGLES",
  "# Households Living in Poverty" = "LNNBELPOV100"
)

predictor_vars <- data[, c("PCTVACANT", "PCTSINGLES", "PCTBACHMOR", "LNNBELPOV100")]

cor_matrix <- cor(predictor_vars, use = "complete.obs", method = "pearson")
rownames(cor_matrix) <- names(custom_labels)
colnames(cor_matrix) <- names(custom_labels)


ggcorrplot(cor_matrix, 
           method = "square",   
           type = "lower",      
           lab = TRUE,       
           lab_size = 3,      
           colors = c("#d73027", "white", "#1a9850"))+
    labs(title = "Correlation Matrix for all Predictor Variables") +
    theme(plot.subtitle = element_text(size = 9, face = "italic"),
        plot.title = element_text(size = 12, face = "bold"), 
        axis.text.x = element_text(size = 7),
        axis.text.y = element_text(size = 7), 
        axis.title = element_text(size = 8))
```



